Text Processing Steps
Some libraries that may be needed:
from bs4 import BeautifulSoup as bsoup
import re
import os
import nltk
from nltk.collocations import *
from itertools import chain
import itertools
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import MWETokenizer
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import reuters
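The steps below also rely on several NLTK data packages; if they are not installed yet, a one-time download such as the following is needed (these are the standard NLTK resource identifiers):
nltk.download('punkt')                       # sentence tokenizer models
nltk.download('stopwords')                   # stop word lists
nltk.download('averaged_perceptron_tagger')  # POS tagger model
nltk.download('reuters')                     # the Reuters corpus imported above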
Convert everything to lowercase:
raw_text = raw_text.lower()
Word Tokenization:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"\w+(?:[-.]\w+)?")
unigram_tokens = tokenizer.tokenize(raw_text)
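As a quick check of the pattern, \w+(?:[-.]\w+)? keeps hyphenated words such as "right-sided" as single tokens (the sample sentence here is hypothetical):
print(tokenizer.tokenize("a right-sided lobe nodule was found"))
# ['a', 'right-sided', 'lobe', 'nodule', 'was', 'found']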
Retokenize the unigram tokens with multi-word expressions (each MWE given as a tuple of words):
For example:
# MWETokenizer expects a list of MWEs, each one a tuple of words;
# passing plain strings would make it treat each character as a token.
mwes = [('lobe', 'nodule')]
from nltk.tokenize import MWETokenizer
mwe_tokenizer = MWETokenizer(mwes)
mwe_tokens = mwe_tokenizer.tokenize(unigram_tokens)
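By default MWETokenizer joins a matched expression with an underscore, so 'lobe' followed by 'nodule' becomes one token; a minimal check on hypothetical tokens:
print(mwe_tokenizer.tokenize(['a', 'lobe', 'nodule', 'was', 'found']))
# ['a', 'lobe_nodule', 'was', 'found']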
Stop Words Removal:
from nltk.corpus import stopwords
stopwords_set = set(stopwords.words('english'))  # a set gives fast membership tests
stopped_tokens = [w for w in mwe_tokens if w not in stopwords_set]
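A quick sanity check on hypothetical tokens:
print([w for w in ['a', 'lobe_nodule', 'was', 'found'] if w not in stopwords_set])
# e.g. ['lobe_nodule', 'found']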
Get the part-of-speech tag of each word:
import nltk.data
# Split the raw text into sentences first; the POS tagger works per sentence.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = sent_detector.tokenize(raw_text.strip())
tagged_sents = []
for sent in sentences:
    uni_sent = tokenizer.tokenize(sent)          # word tokenization
    mwe_text = mwe_tokenizer.tokenize(uni_sent)  # merge multi-word expressions
    tagged_sent = nltk.tag.pos_tag(mwe_text)     # (token, POS tag) pairs
    stopped_tagged_sent = [x for x in tagged_sent if x[0] not in stopwords_set]
    tagged_sents.append(stopped_tagged_sent)
print(tagged_sents)
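For reference, each element of tagged_sents is one sentence as a list of (token, Penn Treebank tag) pairs; a minimal illustration on hypothetical tokens:
print(nltk.pos_tag(['lobe_nodule', 'found']))
# e.g. [('lobe_nodule', 'NN'), ('found', 'VBD')]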